We look for structural features that are frequently mutated and we compare them to the structural features that are usually switched. The interest is to look for splicing changes that are equivalent to mutation changes. We use the following parameters:
Mutation frequency: proportion of all the functional mutations that affect one particular feature. Measured taking into account only the features in the most abundant isoform of each gene.
Switch frequency: sum of all the patients that, per switch, have a feature affected.
Domain frequency: frequency of a feature in the proteome, counting the features in the most abundant isoform for each gene. Used to normalize the previous quantities.
We annotated the domains relatively highly mutated and highly switched (highly meaning > quantile 95).
feat_enrich <- read.delim(paste0(wd,'feature_enrichment.txt'), header=TRUE)
feat_enrich$MutFreq <- feat_enrich$MutIn/feat_enrich$AllMuts
feat_enrich$SwitchFreq <- feat_enrich$SwitchesIn/feat_enrich$AllSwitches
feat_enrich$DomainFrequency <- feat_enrich$DomainCount/feat_enrich$AllDomains
feat_enrich <- feat_enrich[feat_enrich$DomainFrequency != 0,]
for (cancer in cancerTypes){
cat(paste0(cancer,"\n"))
cancer.feat_enrich <- feat_enrich[feat_enrich$Cancer==cancer,]
# plot normalized mutation frequency and switched frequency
minMut <- quantile(cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency,0.95)
minSwitch <- quantile(cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency,0.95)
# plot raw data
r <- cor(cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency,
cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency)
p <- ggplot(cancer.feat_enrich,aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
geom_point(data=subset(cancer.feat_enrich, MutFreq/DomainFrequency > minMut & SwitchFreq/DomainFrequency > minSwitch),aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency,color=Domain)) +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=20,y=100,label=paste0("R = ",round(r,2)))
p <- direct.label(p)
print(p)
}
## brca
## Loading required package: proto
## coad
## hnsc
## kich
## kirc
## kirp
## lihc
## luad
## lusc
## prad
## thca
These plots leads to think that usually changes achieved by splicing are not achieved by mutations and viceversa. Still, there are some outlier domains that are a bit outside the L-shape, suggesting that some features can either be affected by any mechanism. To find out about those, we remove the features that have 0 on either SwitchFreq or MutFreq.
# plot log representations and remove those that have 0 on either side
for (cancer in cancerTypes){
cat(paste0(cancer,"\n"))
cancer.feat_enrich <- feat_enrich[feat_enrich$Cancer==cancer,]
# plot normalized mutation frequency and switched frequency
minMut <- quantile(cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency,0.95)
minSwitch <- quantile(cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency,0.95)
# filter data and get correlation
r.df <- subset(cancer.feat_enrich, MutFreq != 0 & SwitchFreq != 0)
r <- cor(r.df$MutFreq/r.df$DomainFrequency,r.df$SwitchFreq/r.df$DomainFrequency)
p <- ggplot(r.df,aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
geom_point(data=subset(r.df, MutFreq/DomainFrequency > minMut & SwitchFreq/DomainFrequency > minSwitch),aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency,color=Domain)) +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=6,y=20,label=paste0("R = ",round(r,2)))
p <- direct.label(p)
print(p)
}
## brca
## coad
## hnsc
## kich
## kirc
## kirp
## lihc
## luad
## lusc
## prad
## thca
Some degree of correlation appears, particularly in the kidney tumors (kich,kirc,kirp).
We apply the same process to aggregated data from all tumors.
feat_enrich_agg <- ddply(feat_enrich,.(Domain), summarise,
MutIn=sum(MutIn), AllMuts=sum(AllMuts),
SwitchesIn=sum(SwitchesIn), AllSwitches=sum(AllSwitches),
DomainCount=sum(DomainCount),AllDomains=sum(AllDomains))
feat_enrich_agg$MutFreq <- feat_enrich_agg$MutIn/feat_enrich_agg$AllMuts
feat_enrich_agg$SwitchFreq <- feat_enrich_agg$SwitchesIn/feat_enrich_agg$AllSwitches
feat_enrich_agg$DomainFrequency <- feat_enrich_agg$DomainCount/feat_enrich_agg$AllDomains
minMut <- quantile(feat_enrich_agg$MutFreq/feat_enrich_agg$DomainFrequency,0.95)
minSwitch <- quantile(feat_enrich_agg$SwitchFreq/feat_enrich_agg$DomainFrequency,0.95)
# plot original data
r <- cor(feat_enrich_agg$MutFreq/feat_enrich_agg$DomainFrequency,
feat_enrich_agg$SwitchFreq/feat_enrich_agg$DomainFrequency)
p <- ggplot(data=feat_enrich_agg,
aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
geom_point(data=subset(feat_enrich_agg,
MutFreq/DomainFrequency > minMut &
SwitchFreq/DomainFrequency > minSwitch),
aes(MutFreq/DomainFrequency,
SwitchFreq/DomainFrequency,
color=Domain)) +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=20,y=100,label=paste0("R = ",round(r,2)))
p <- direct.label(p)
p
# plot log2 representations and remove those that have 0 on either side
r.df <- subset(feat_enrich_agg, MutFreq != 0 & SwitchFreq != 0)
r <- cor(r.df$MutFreq/r.df$DomainFrequency,r.df$SwitchFreq/r.df$DomainFrequency)
p <- ggplot(data=r.df,aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
geom_point(data=subset(r.df,MutFreq/DomainFrequency > minMut &
SwitchFreq/DomainFrequency > minSwitch),
aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency,color=Domain)) +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=6,y=20,label=paste0("R = ",round(r,2)))
p <- direct.label(p)
p
Similar results appear, but the correlation is non-existant, suggesting that the domains that need to be affected in each cancer type are tumor specific.
We wanted to know if the correlation improves if we aggregate the domains by function affected. So, maybe if domains that are only affected by mutations or by switches actually share some functions.
The frequencies are calculated as the quotient between the “in” counts (mutations in the feature, switches affecting the feature, of number of times that feature is observed in the proteome) and the total number of counts.
# read GO termns
prosite_annotation <- read.delim(paste0(wd,"prosite2go.clean.txt"),row.names = NULL,header=F)
pfam_annotation <- read.delim(paste0(wd,"Pfam2go.clean.txt"),row.names = NULL,header=F)
annotation <- rbind(prosite_annotation,pfam_annotation)
colnames(annotation) <- c("id","GO")
# remove the GO prefix from all of them
z <- gsub(";GO:[[:digit:]]+", "", annotation$GO)
z <- gsub("^GO:", "", z)
z <- paste0(toupper(substring(z, 1,1)),substring(z, 2))
annotation$GO <- z
# get the interpro id for the domains
feat_enrich_agg.go <- feat_enrich_agg
feat_enrich_agg.go$id <- unlist(strsplit(as.character(feat_enrich_agg.go$Domain),"|",fixed = T))[c(T,F)]
# merge the two tables. if a domain has several annotations, if will appear several times
feat_enrich_agg.go <- merge(feat_enrich_agg.go,annotation,all.x=T)
# aggregate by GO term and calculate the frequencies
feat_function_aggregated <- ddply(feat_enrich_agg.go,.(GO),summarise,MutIn=sum(MutIn),AllMuts = sum(AllMuts), SwitchesIn = sum(SwitchesIn), AllSwitches = sum(AllSwitches), DomainCount=sum(DomainCount),AllDomains=sum(AllDomains))
feat_function_aggregated$MutFreq <- feat_function_aggregated$MutIn/feat_function_aggregated$AllMuts
feat_function_aggregated$SwitchFreq <- feat_function_aggregated$SwitchesIn/feat_function_aggregated$AllSwitches
feat_function_aggregated$GOFrequency <- feat_function_aggregated$DomainCount/feat_function_aggregated$AllDomains
# plot original data
r <- cor(feat_function_aggregated$MutFreq/feat_function_aggregated$GOFrequency,feat_function_aggregated$SwitchFreq/feat_function_aggregated$GOFrequency)
p <- ggplot(data=feat_function_aggregated,aes(MutFreq/GOFrequency,SwitchFreq/GOFrequency)) +
geom_point() +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=20,y=20,label=paste0("R = ",round(r,2)))
p
Same as before, there seems to be a mutual exclusion between functions affected by mutations and functions affected by alternative splicing.
r.df <- subset(feat_function_aggregated, SwitchFreq != 0 & MutFreq != 0)
r <- cor(r.df$MutFreq/r.df$GOFrequency,r.df$SwitchFreq/r.df$GOFrequency)
# show the extreme values
minMut <- quantile(r.df$MutFreq/r.df$GOFrequency,0.95)
minSwitch <- quantile(r.df$SwitchFreq/r.df$GOFrequency,0.95)
p <- ggplot(data=r.df,aes(MutFreq/GOFrequency,SwitchFreq/GOFrequency)) +
geom_point() +
geom_point(data=subset(r.df,MutFreq/GOFrequency > minMut &
SwitchFreq/GOFrequency > minSwitch),
aes(MutFreq/GOFrequency,SwitchFreq/GOFrequency,color=GO)) +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=6,y=20,label=paste0("R = ",round(r,2)))
p <- direct.label(p)
p
Removing the cases with perfect mutual exclusion only stresses the L-shape of the distribution. Only the urocanate metabolism stands out, same as in previous analyses. This amino acid is part of the degradation of the histidine, and any relation to cancer progression is not evident.
The L-shaped plots, might mean there is a mutual exclusion between features affected by mutations and alternative splicing. However, it may be just an illusion due to the outliers that reduce the resolution. So we make the same plot removing the outliers, outlier meaning points more extreme than quantile 95 in either axis.
for (cancer in cancerTypes){
cat(paste0(cancer,"\n"))
cancer.feat_enrich <- feat_enrich[feat_enrich$Cancer==cancer,]
# plot normalized mutation frequency and switched frequency
mutOutliers <- quantile(cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency,0.95)
switchOutliers <- quantile(cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency,0.95)
# remove outliers
cancer.feat_enrich <- cancer.feat_enrich[cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency <= mutOutliers,]
cancer.feat_enrich <- cancer.feat_enrich[cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency <= switchOutliers,]
# plot raw data
r <- cor(cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency,
cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency)
p <- ggplot(cancer.feat_enrich,aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=4,y=3,label=paste0("R = ",round(r,2)))
print(p)
}
## brca
## coad
## hnsc
## kich
## kirc
## kirp
## lihc
## luad
## lusc
## prad
## thca
The L-shape seems kept in all the tumor types, reaffirming the mutual exclusion.
In order to see if there is any feature enriched in mutations in general, as we do for individual switches, we want to check if any kind of domain is frequently mutated/switched more than expected.
Below are plotted the with an adjusted p-value in either mutations or switches < 0.05, and a frequency higher than 0 in both axis. Also, tables with the top 10 domains ordered by mutation enrichment p-value, and filtered by adjusted p of enrichment in switches < 0.05.
for (cancer in cancerTypes){
cat(paste0(cancer,"\n\n"))
cancer.feat_enrich <- feat_enrich[feat_enrich$Cancer==cancer,]
# binomial tests to mutation affection and switch affection
cancer.feat_enrich$p_mut <- apply(cancer.feat_enrich[,c("MutIn","AllMuts","DomainFrequency")],1, function(x){
p <- binom.test(x[1],x[2],x[3],"greater")
p$p.value
} )
cancer.feat_enrich$adjp_mut <- p.adjust(cancer.feat_enrich$p_mut)
cancer.feat_enrich$p_swt <- apply(cancer.feat_enrich[,c("SwitchesIn","AllSwitches","DomainFrequency")],1, function(x){
p <- binom.test(x[1],x[2],x[3],"greater")
p$p.value
} )
cancer.feat_enrich$adjp_swt <- p.adjust(cancer.feat_enrich$p_swt)
# plot domains that are enriched either by mutation affection or by switch affection
cancer.feat_enrich <- cancer.feat_enrich[cancer.feat_enrich$adjp_swt < 0.05 | cancer.feat_enrich$adjp_mut < 0.05,]
cancer.feat_enrich <- subset(cancer.feat_enrich, MutFreq != 0 & SwitchFreq != 0)
r <- cor(cancer.feat_enrich$MutFreq/cancer.feat_enrich$DomainFrequency,
cancer.feat_enrich$SwitchFreq/cancer.feat_enrich$DomainFrequency)
p <- ggplot(cancer.feat_enrich,aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=4,y=50,label=paste0("R = ",round(r,2)))
print(p)
cat("\n")
# show top 10 domains affected by mutations
df <- cancer.feat_enrich[order(cancer.feat_enrich$p_mut),c("Domain","adjp_mut","adjp_swt")]
df <- df[df$adjp_swt < 0.05,]
# prepare colums for print
df$Domain <- gsub("|"," - ",df$Domain,fixed = T)
df$Domain <- gsub("_"," ",df$Domain,fixed = T)
df$adjp_mut <- format(df$adjp_mut,scientific=TRUE)
df$adjp_swt <- format(df$adjp_swt,scientific=TRUE)
print(kable(head(df,n = 10),row.names = F, digits=3))
cat("\n")
}
brca
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PF00520 - Ion transport protein | 4.671896e-16 | 1.688134e-09 |
| PS50057 - FERM 3 FERM domain profile. | 1.178576e-09 | 2.309237e-18 |
| PF05622 - HOOK protein | 2.251436e-07 | 4.168274e-02 |
| PF00664 - ABC transporter transmembrane region | 2.597141e-07 | 5.111705e-20 |
| PF00176 - SNF2 family N-terminal domain | 2.561457e-05 | 8.680972e-17 |
| PS51420 - RHO small GTPase Rho family profile. | 1.003748e-04 | 1.219748e-151 |
| PF01401 - Angiotensin-converting enzyme | 1.584452e-04 | 8.056486e-102 |
| PF00930 - Dipeptidyl peptidase IV (DPP IV) N-terminal region | 8.480104e-03 | 1.319902e-05 |
| PF00155 - Aminotransferase class I and II | 1.393616e-02 | 3.566352e-27 |
| PF01044 - Vinculin family | 8.568616e-02 | 7.459343e-188 |
coad
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PS51420 - RHO small GTPase Rho family profile. | 1.914362e-49 | 9.061998e-64 |
| PF00071 - Ras family | 9.653021e-38 | 1.068170e-20 |
| PF07714 - Protein tyrosine kinase | 8.660904e-26 | 3.151869e-06 |
| PF00092 - von Willebrand factor type A domain | 2.547325e-07 | 1.268234e-126 |
| PF01044 - Vinculin family | 9.410725e-06 | 7.039743e-04 |
| PF00664 - ABC transporter transmembrane region | 1.074739e-05 | 2.157057e-30 |
| PF00860 - Permease family | 2.079056e-05 | 1.034698e-26 |
| PS50323 - ARG RICH Arginine-rich region profile. | 5.695313e-05 | 1.112492e-09 |
| PF00155 - Aminotransferase class I and II | 3.099756e-04 | 9.086261e-18 |
| PF11467 - Lens epithelium-derived growth factor (LEDGF) | 4.935432e-04 | 1.955142e-63 |
hnsc
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PF07714 - Protein tyrosine kinase | 7.527393e-11 | 5.043864e-10 |
| PS51082 - WH2 WH2 domain profile. | 5.671464e-10 | 7.649944e-10 |
| PF00102 - Protein-tyrosine phosphatase | 1.009955e-09 | 4.028493e-13 |
| PS51421 - RAS small GTPase Ras family profile. | 1.058633e-08 | 2.081999e-02 |
| PS51420 - RHO small GTPase Rho family profile. | 1.512579e-06 | 1.395282e-28 |
| PS50311 - CYS RICH Cysteine-rich region profile. | 3.178261e-06 | 4.724688e-05 |
| PF01044 - Vinculin family | 6.473548e-02 | 7.319428e-14 |
| PF05622 - HOOK protein | 5.946707e-01 | 5.813663e-04 |
| PF01663 - Type I phosphodiesterase / nucleotide pyrophosphatase | 1.000000e+00 | 1.391906e-22 |
| PF06920 - Dedicator of cytokinesis | 1.000000e+00 | 2.478373e-04 |
kich
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PF07992 - Pyridine nucleotide-disulphide oxidoreductase | 6.037136e-02 | 3.015943e-06 |
| PF00076 - RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) | 1.000000e+00 | 7.308922e-08 |
| PF01663 - Type I phosphodiesterase / nucleotide pyrophosphatase | 1.000000e+00 | 4.983561e-02 |
| PS50324 - SER RICH Serine-rich region profile. | 1.000000e+00 | 3.892985e-04 |
| PF00675 - Insulinase (Peptidase family M16) | 1.000000e+00 | 6.961109e-14 |
| PS50057 - FERM 3 FERM domain profile. | 1.000000e+00 | 4.826861e-04 |
| PF09286 - Pro-kumamolisin, activation domain | 1.000000e+00 | 4.935347e-02 |
| PF01044 - Vinculin family | 1.000000e+00 | 2.801069e-07 |
| PS51676 - FF FF domain profile. | 1.000000e+00 | 4.464125e-10 |
| PF05193 - Peptidase M16 inactive domain | 1.000000e+00 | 3.818177e-11 |
kirc
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PF07714 - Protein tyrosine kinase | 2.036249e-09 | 1.016402e-13 |
| PF00155 - Aminotransferase class I and II | 9.682849e-03 | 1.996727e-13 |
| PS51082 - WH2 WH2 domain profile. | 1.110233e-01 | 2.125206e-05 |
| PS50313 - GLU RICH Glutamic acid-rich region profile. | 5.381470e-01 | 2.693851e-63 |
| PS50929 - ABC TM1F ABC transporter integral membrane type-1 fused domain profile. | 6.229252e-01 | 7.447108e-19 |
| PF07992 - Pyridine nucleotide-disulphide oxidoreductase | 7.838221e-01 | 6.789876e-71 |
| PF12710 - haloacid dehalogenase-like hydrolase | 1.000000e+00 | 8.972374e-42 |
| PS50057 - FERM 3 FERM domain profile. | 1.000000e+00 | 1.761147e-65 |
| PS50311 - CYS RICH Cysteine-rich region profile. | 1.000000e+00 | 8.060443e-222 |
| PF05622 - HOOK protein | 1.000000e+00 | 3.446829e-14 |
kirp
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PF00648 - Calpain family cysteine protease | 1.975373e-01 | 1.889482e-16 |
| PS50203 - CALPAIN CAT Cysteine proteinase, calpain-type, catalytic domain profile. | 1.975373e-01 | 1.889482e-16 |
| PS51420 - RHO small GTPase Rho family profile. | 1.000000e+00 | 1.384471e-82 |
| PF01268 - Formate–tetrahydrofolate ligase | 1.000000e+00 | 2.420897e-02 |
| PF00092 - von Willebrand factor type A domain | 1.000000e+00 | 7.937150e-05 |
| PS50086 - TBC RABGAP TBC/rab GAP domain profile. | 1.000000e+00 | 4.109033e-18 |
| PS51082 - WH2 WH2 domain profile. | 1.000000e+00 | 1.989953e-65 |
| PF12775 - P-loop containing dynein motor region D3 | 1.000000e+00 | 8.950238e-67 |
| PF01401 - Angiotensin-converting enzyme | 1.000000e+00 | 1.126612e-24 |
| PF01044 - Vinculin family | 1.000000e+00 | 5.680443e-37 |
lihc
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PS51421 - RAS small GTPase Ras family profile. | 1.560436e-04 | 2.241767e-15 |
| PF00664 - ABC transporter transmembrane region | 3.882170e-03 | 3.068473e-16 |
| PF08423 - Rad51 | 1.287155e-02 | 4.909633e-02 |
| PF07888 - Calcium binding and coiled-coil domain (CALCOCO1) like | 2.119658e-02 | 2.685803e-26 |
| PS51420 - RHO small GTPase Rho family profile. | 9.121138e-01 | 5.524465e-26 |
| PS50313 - GLU RICH Glutamic acid-rich region profile. | 1.000000e+00 | 4.860473e-18 |
| PF00405 - Transferrin | 1.000000e+00 | 2.847513e-03 |
| PF00108 - Thiolase, N-terminal domain | 1.000000e+00 | 3.158232e-05 |
| PF02390 - Putative methyltransferase | 1.000000e+00 | 3.537496e-04 |
| PS51625 - SAM MT TRMB SAM-dependent methyltransferase TRMB-type domain profile. | 1.000000e+00 | 3.537496e-04 |
luad
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PS51420 - RHO small GTPase Rho family profile. | 1.205195e-68 | 2.920173e-122 |
| PF00071 - Ras family | 5.749245e-54 | 5.195937e-95 |
| PS50835 - IG LIKE Ig-like domain profile. | 2.203623e-34 | 5.443420e-11 |
| PF00102 - Protein-tyrosine phosphatase | 1.124295e-26 | 4.774681e-11 |
| PF00176 - SNF2 family N-terminal domain | 4.752054e-18 | 1.341321e-08 |
| PF07690 - Major Facilitator Superfamily | 5.620283e-16 | 1.514242e-03 |
| PF00664 - ABC transporter transmembrane region | 1.033038e-09 | 2.634336e-35 |
| PF00155 - Aminotransferase class I and II | 8.311274e-08 | 2.538515e-10 |
| PF00685 - Sulfotransferase domain | 9.713705e-08 | 2.526459e-29 |
| PF02181 - Formin Homology 2 Domain | 1.961319e-07 | 1.074291e-32 |
lusc
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PS50835 - IG LIKE Ig-like domain profile. | 3.080884e-06 | 4.772383e-05 |
| PF00155 - Aminotransferase class I and II | 1.274941e-04 | 1.798746e-44 |
| PS51420 - RHO small GTPase Rho family profile. | 1.345727e-04 | 9.741149e-125 |
| PF00176 - SNF2 family N-terminal domain | 3.717827e-04 | 7.568047e-107 |
| PF00685 - Sulfotransferase domain | 1.196416e-03 | 7.695599e-82 |
| PF00102 - Protein-tyrosine phosphatase | 2.009468e-03 | 6.420706e-78 |
| PS50188 - B302 SPRY B30.2/SPRY domain profile. | 7.419627e-02 | 2.963774e-75 |
| PS50261 - G PROTEIN RECEP F2 4 G-protein coupled receptors family 2 profile 2. | 2.116020e-01 | 9.271158e-13 |
| PF02181 - Formin Homology 2 Domain | 9.357077e-01 | 6.459373e-16 |
| PF12026 - Domain of unknown function (DUF3513) | 9.377455e-01 | 1.112088e-32 |
prad
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PS01179 - PID Phosphotyrosine interaction domain (PID) profile. | 4.587847e-01 | 2.087310e-23 |
| PS51421 - RAS small GTPase Ras family profile. | 7.284521e-01 | 1.176410e-24 |
| PS50323 - ARG RICH Arginine-rich region profile. | 1.000000e+00 | 3.030737e-29 |
| PS50311 - CYS RICH Cysteine-rich region profile. | 1.000000e+00 | 6.568654e-83 |
| PF00640 - Phosphotyrosine interaction domain (PTB/PID) | 1.000000e+00 | 7.764041e-25 |
| PF00149 - Calcineurin-like phosphoesterase | 1.000000e+00 | 7.861569e-14 |
| PS51420 - RHO small GTPase Rho family profile. | 1.000000e+00 | 5.711423e-85 |
| PS51082 - WH2 WH2 domain profile. | 1.000000e+00 | 1.470825e-32 |
| PS50315 - GLY RICH Glycine-rich region profile. | 1.000000e+00 | 1.121693e-22 |
| PF01154 - Hydroxymethylglutaryl-coenzyme A synthase N terminal | 1.000000e+00 | 3.262760e-44 |
thca
| Domain | adjp_mut | adjp_swt |
|---|---|---|
| PS51420 - RHO small GTPase Rho family profile. | 1.496503e-57 | 5.195299e-96 |
| PS51421 - RAS small GTPase Ras family profile. | 2.231248e-51 | 4.039399e-15 |
| PF00082 - Subtilase family | 1.000000e+00 | 1.043974e-10 |
| PF03028 - Dynein heavy chain and region D6 of dynein motor | 1.000000e+00 | 8.485220e-05 |
| PS51433 - PNT Pointed (PNT) domain profile. | 1.000000e+00 | 2.390115e-86 |
| PF05193 - Peptidase M16 inactive domain | 1.000000e+00 | 9.171830e-12 |
| PF00566 - Rab-GTPase-TBC domain | 1.000000e+00 | 7.638234e-21 |
| PS50929 - ABC TM1F ABC transporter integral membrane type-1 fused domain profile. | 1.000000e+00 | 2.761762e-80 |
| PS50313 - GLU RICH Glutamic acid-rich region profile. | 1.000000e+00 | 2.382536e-02 |
| PS50255 - CYTOCHROME B5 2 Cytochrome b5 family, heme-binding domain profile. | 1.000000e+00 | 5.048125e-21 |
Kinase domains are frequently mutated, but not that frequently switched, except in some cases (PF07714). Also, there are some recurrent domains such as (RAS small GTPase and RHO small GTPase).
Although there seems to be a correlation between tumor type and domain affected, it is interesting to aggregate the results to find out if the important domains are kept.
feat_enrich_agg.enrichment <- feat_enrich_agg
feat_enrich_agg.enrichment$p_mut <- apply(feat_enrich_agg.enrichment[,c("MutIn","AllMuts","DomainFrequency")],1, function(x){
p <- binom.test(x[1],x[2],x[3],"greater")
p$p.value
} )
feat_enrich_agg.enrichment$adjp_mut <- p.adjust(feat_enrich_agg.enrichment$p_mut)
feat_enrich_agg.enrichment$p_swt <- apply(feat_enrich_agg.enrichment[,c("SwitchesIn","AllSwitches","DomainFrequency")],1, function(x){
p <- binom.test(x[1],x[2],x[3],"greater")
p$p.value
} )
feat_enrich_agg.enrichment$adjp_swt <- p.adjust(feat_enrich_agg.enrichment$p_swt)
# filter out no significant and only affected by one mechanism
df <- feat_enrich_agg.enrichment[feat_enrich_agg.enrichment$adjp_swt < 0.05 | feat_enrich_agg.enrichment$adjp_mut < 0.05,]
df <- subset(df, MutFreq != 0 & SwitchFreq != 0)
r <- cor(df$MutFreq/df$DomainFrequency,df$SwitchFreq/df$DomainFrequency)
ggplot(df,aes(MutFreq/DomainFrequency,SwitchFreq/DomainFrequency)) +
geom_point() +
smartas_theme() +
geom_smooth(method=lm) +
geom_text(x=4,y=50,label=paste0("R = ",round(r,2)))
# show top 50 domains affected by mutations
df <- feat_enrich_agg.enrichment[order(feat_enrich_agg.enrichment$p_mut),c("Domain","adjp_mut","adjp_swt","MutFreq","SwitchFreq","DomainFrequency")]
df <- df[df$adjp_mut < 0.05 | df$adjp_swt < 0.05,]
# prepare colums for print
df$Domain <- gsub("|"," - ",df$Domain,fixed = T)
df$Domain <- gsub("_"," ",df$Domain,fixed = T)
df$adjp_mut <- format(df$adjp_mut,digits=3)
df$adjp_swt <- format(df$adjp_swt,digits=3)
df$fc_mut <- round(df$MutFreq/df$DomainFrequency,2)
df$fc_swt <- round(df$SwitchFreq/df$DomainFrequency,2)
df <- df[order(-df$fc_mut),]
datatable(df[,c("Domain","adjp_mut","adjp_swt","fc_mut","fc_swt")])
We want to be able to rank domains based on mutations, and to know if that is possible.